We want to look at real datasets in order to simulate realistic ones.

# Load the cortical single-cell RNA-seq dataset and the cluster labels
# produced by the original analysis (results_160705).
data("cortical")
pathLabels = "~/Documents/BRAIN/gitrepo/brainData/analysis/results_160705/cluster_labels.txt"
# comment.char = "%": disables the default "#" comment character so labels
# containing "#" are read intact.
labels <- read.table(pathLabels, stringsAsFactors = FALSE, sep='\t',
                     comment.char = "%")

# Keep only cells in the "core" clusters: drop unassigned cells ("-1")
# and clusters m2, m4, m5, m6.
prefilter <- cortical[,rownames(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")),])]
# Gene filter: more than 10 counts in at least 10 cells.
# NOTE(review): `filter` masks stats::filter for the rest of the script.
filter <- apply(assay(prefilter)>10, 1, sum)>=10

postfilter <- prefilter[filter,]
# Batch variable: C1 sequencing run id, with unused levels dropped.
batch <- droplevels(colData(postfilter)$MD_c1_run_id)
# QC metrics (first 14 colData columns — presumably the QC fields; TODO
# confirm against the dataset documentation), summarized by their first
# two principal components.
qc <- as.matrix(colData(postfilter)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
# Design matrix for zinbFit: batch + first two QC PCs.
mod <- model.matrix(~ batch + qcpca$x[, 1:2])
# Biological grouping (cluster colors) of the retained cells; used to
# color-code every plot below.
bio <- factor(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")), "ClusterMergedColor"])
# From here on work directly on the raw count matrix.
postfilter = assay(postfilter)

# Rank genes by variance of log1p(counts) and keep the 1,000 most variable.
vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
core <- postfilter[names(vars)[1:1000],]
# Per-cell summaries: number of detected genes and total counts.
# (detection_rate is redefined as a proportion later in the document.)
detection_rate <- colSums(core>0)
coverage <- colSums(core)

We only focus on the cells that passed the QC filters (applied by the original authors) and that were part of the “core” clusters. We will color-code the cells by either known cell type or by inferred cluster (inferred in the original study). We select these cells, remove the ERCC spike-ins, and filter out the genes that do not have at least 10 counts in at least 10 cells. Number of retained genes:

# Number of genes passing the 10-counts-in-10-cells filter.
print(sum(filter))
## [1] 7591

To speed up the computations, I will focus on the top 1,000 most variable genes. Is this a good idea?

# Dropout rate vs mean of the non-zero log1p counts, before filtering.
# NOTE(review): `mean` is a variable here and masks base::mean below.
mean = apply(log1p(assay(prefilter)), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(assay(prefilter) == 0), xlim = c(1,11), ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              main = 'no filtering')

# Color ramp shared by all smoothScatter panels in this document.
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
# Three-panel comparison: no filtering / after gene filter / top 1,000 genes.
par(mfrow = c(1,3))
smoothScatter(mean, rowMeans(assay(prefilter) == 0), xlim = c(0,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp =pal, main = 'no filtering')

# Same plot after the 10-counts-in-10-cells gene filter.
mean = apply(log1p(postfilter), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(postfilter == 0),xlim = c(1,11),
              nbin = 256, nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'after filtering')

# Same plot restricted to the 1,000 most variable genes.
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(core == 0),xlim = c(1,11),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = '1,000 most variable genes')

Pink and glia cells have been removed. We fit the data with K = 3, V intercepts in both Mu and Pi, commondispersion = FALSE, and with X accounting for batch and QC.

# Fit the ZINB-WaVE model on the top 1,000 genes: K = 3 latent factors,
# gene-wise dispersion (commondispersion = FALSE), X = batch + 2 QC PCs.
# system.time() records how long the fit takes (reported below).
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 3, X = mod,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 693.297 313.229 425.589

True W

If we simulate W from real data, it will look like this.

# Pairwise scatter of the K = 3 columns of W (per-cell latent factors),
# colored by the clusters inferred in the original study.
# NOTE(review): `cols` is not defined in this chunk — presumably a palette
# set earlier in the document; confirm.
pairs(zinb@W, col=cols[bio], pch=19, main="W\ncolor = analysis results 160705")

Gamma

# Cell-level intercepts of the ZINB model: first row of gamma_mu (mean
# component) and gamma_pi (zero-inflation component), one value per cell.
gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]

# Compare the cell-level intercepts with simple per-cell QC summaries.
df <- data.frame(gamma_mu = gamma_mu,
                 gamma_pi = gamma_pi,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.2439368      0.5252101  0.9883636
## gamma_pi       -0.2439368  1.0000000     -0.8470762 -0.2861173
## detection_rate  0.5252101 -0.8470762      1.0000000  0.5482978
## coverage        0.9883636 -0.2861173      0.5482978  1.0000000
# Long format for ggplot; melt() keeps the factor `celltype` as the id
# variable and stacks gamma_mu / gamma_pi into (variable, value).
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = bio)
gamma = melt(gamma)

# Histogram + density of each intercept, one facet per variable.
ggplot(gamma, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable) + xlab('gamma')

# Boxplots of the two intercepts (x tick labels intentionally blank).
ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
  theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# Densities of the intercepts stratified by inferred cell type.
ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('gamma') + theme_bw()

Beta

Because we are accounting for batch and QC in X, beta has M = 25 rows.

# Gene-level intercepts of the ZINB model: first row of beta_mu and
# beta_pi (one value per gene out of the M = 25 covariate rows).
beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]

df <- data.frame(beta_mu_intercept = beta_mu,
                 beta_pi_intercept = beta_pi)
# NOTE(review): df has one row per GENE while `bio` is per CELL, so
# col=cols[bio] recycles cell colors over genes — confirm this is intended.
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                   beta_mu_intercept beta_pi_intercept
## beta_mu_intercept         1.0000000        -0.3610809
## beta_pi_intercept        -0.3610809         1.0000000
# Long format for ggplot (variable = intercept name, value = estimate).
beta = data.frame(beta_mu_intercept = beta_mu, beta_pi_intercept = beta_pi)
beta = melt(beta)
# Histogram + density of the two gene-level intercepts.
ggplot(beta, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable, scales = 'free') + xlab('beta')

# Boxplots of the intercepts (x tick labels intentionally blank).
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# Remove outliers: clip the y-axis to the central 80% range of the betas so
# the boxplots are readable. BUGFIX(review): the original computed BOTH
# limits from the 0.1 quantile (max(quantile(..., 0.1), ...)), clipping the
# plot to a sliver between the two 10th percentiles — which is why the
# warning reported 1314 of 2000 values removed. The upper limit should use
# the 0.9 quantile. Also renamed the locals so base::min/base::max are not
# masked.
upper <- max(quantile(beta_pi, 0.9), quantile(beta_mu, 0.9))
lower <- min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta removing outliers') +
  scale_x_discrete(breaks = c('', ''), drop=FALSE) +
  scale_y_continuous(limits = c(lower, upper))
## (ggplot will still warn about the rows outside the 10%-90% limits,
## but far fewer than the 1314 removed by the original 0.1/0.1 limits.)

Alpha

# Factor loadings: alpha_mu and alpha_pi have one row per latent factor
# (K = 3) and one column per gene; transpose so genes are the observations
# in the pairs plots.
pairs(t(zinb@alpha_mu))

pairs(t(zinb@alpha_pi))

# All six loading vectors side by side, one row per gene.
df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
                 alpha_mu_2 = zinb@alpha_mu[2, ],
                 alpha_mu_3 = zinb@alpha_mu[3, ],
                 alpha_pi_1 = zinb@alpha_pi[1, ],
                 alpha_pi_2 = zinb@alpha_pi[2, ],
                 alpha_pi_3 = zinb@alpha_pi[3, ])
pairs(df, pch=19)

print(cor(df, method="spearman"))
##              alpha_mu_1   alpha_mu_2   alpha_mu_3   alpha_pi_1
## alpha_mu_1  1.000000000 -0.002558535 -0.118975103 -0.300629485
## alpha_mu_2 -0.002558535  1.000000000 -0.007490167 -0.051727888
## alpha_mu_3 -0.118975103 -0.007490167  1.000000000  0.049573562
## alpha_pi_1 -0.300629485 -0.051727888  0.049573562  1.000000000
## alpha_pi_2 -0.020078504 -0.104977173 -0.067802696 -0.003056859
## alpha_pi_3  0.141028173 -0.090183390 -0.307602548 -0.040068676
##              alpha_pi_2  alpha_pi_3
## alpha_mu_1 -0.020078504  0.14102817
## alpha_mu_2 -0.104977173 -0.09018339
## alpha_mu_3 -0.067802696 -0.30760255
## alpha_pi_1 -0.003056859 -0.04006868
## alpha_pi_2  1.000000000  0.08323482
## alpha_pi_3  0.083234819  1.00000000

Dispersion

Dispersion was estimated on FQ-normalized counts using the function estimateDisp from edgeR. Dispersions estimated using edgeR and ZINB do not seem to be on the same scale. I think it is because I estimate the dispersion on (normalized) counts with edgeR, whereas (if I understood correctly) ZINB estimates the dispersion on log1p(counts). We should rely on the dispersion values from edgeR, as we are going to simulate the counts with \[Y_{i,j} \sim NB(\mu_{i,j}, \phi_j)\]

# Compare gene-wise dispersions from edgeR (fit on full-quantile normalized
# counts via EDASeq offsets) against the ZINB-WaVE dispersions (zeta).
par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
# offset = TRUE returns normalization offsets rather than rounded
# normalized counts; spelled out (`T` is a reassignable variable, not a
# reserved word, so TRUE is the safe form).
fq = betweenLaneNormalization(set, which = "full", offset = TRUE)
# edgeR expects offsets with the opposite sign convention to EDASeq.
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, zinb@zeta, ylab = 'zinb dispersion',
     xlab = 'edgeR tagwise dispersion', main = 'Dispersion')

# Negative correlation: zeta is on a log scale, roughly -log(dispersion).
print(cor(disp$tagwise.dispersion, zinb@zeta, method="spearman"))
## [1] -0.7247025
plot(density(disp$tagwise.dispersion), main = 'edgeR tagwise dispersion')

Estimated mu and pi

par(mfrow=c(1, 2))
# NOTE(review): detection_rate is silently REDEFINED here as a proportion
# (colMeans) — earlier in the document it was a count (colSums).
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
# Per-cell comparison of fitted dropout probability / mean expression with
# observed detection rate and coverage. Assumes getPi/getMu return cells
# in rows and genes in columns — TODO confirm against zinbwave docs.
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=cols[bio], ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=cols[bio])

# Per-gene comparison: observed mean/dropout relationship (left) vs the
# fitted one (middle: scatter, right: smoothed).
par(mfrow=c(1, 3))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(core == 0),xlim = c(2,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'True')
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
     xlab = "Estimated Mean Expression", xlim = c(2,8),
     ylab = "Estimated Dropout Rate", pch=19, col=cols[bio],
     ylim = c(0, 1), main = 'Estimated')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(2,8),
              xlab = "Estimated Mean Expression", main = 'Estimated',
              ylab = "Estimated Dropout Rate",ylim = c(0,1),
              colramp = pal)